
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# DISCLAIMER AND GENERAL INFORMATION
#
# File: SI7_topic_model.R
# Purpose: Produces results presented in section SI7
# Date: June 2025
# Data: ./data/open_ended.xlsx
#
# See 00_data_prep.R for technical disclaimer on R versions
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Libraries
library(tidyverse)
library(RColorBrewer) # for color palettes
library(tm) # Framework for text mining
library(RTextTools) # a machine learning package for text classification written in R
library(qdapDictionaries) # collection of text analysis dictionaries
library(SnowballC) # for stemming
library(matrixStats) # for statistics
library(igraph)
library(stm)

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# (A) Structural Topic Model ----
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


# Load data
survey.df <- readxl::read_excel("./data/open_ended.xlsx")
fulldocs_survey <- Corpus(VectorSource(survey.df$qual_effect_ccpol))

inspect(fulldocs_survey[16])

fulldocs_survey <- tm_map(fulldocs_survey, removeWords, stopwords("english"))

# Text pre-processing
su_dtm <- DocumentTermMatrix(fulldocs_survey,
                             control = list(tolower = TRUE,
                                            removePunctuation = TRUE,
                                            removeNumbers = TRUE,
                                            stopwords = TRUE,
                                            stemming = TRUE))

bt_freq <- colSums(as.matrix(su_dtm))

# order
bt_ord <- order(bt_freq)

# Least frequent terms
bt_freq[head(bt_ord)]
bt_freq[tail(bt_ord)]

head(table(bt_freq),15)
tail(table(bt_freq),15)

findFreqTerms(su_dtm, lowfreq=50)

findAssocs(su_dtm, "nation", 0.3)

inspect(su_dtm[1:5,300:310])


temp_surveyfull <- textProcessor(documents=survey.df$qual_effect_ccpol, metadata=survey.df, language = "english")
meta_full <- temp_surveyfull$meta
vocab_full <- temp_surveyfull$vocab #4436 words
docs_full <- temp_surveyfull$documents

# STM with different number of topics
model_full_4 <- stm(docs_full,vocab_full, 4, data=meta_full, seed = 15, max.em.its = 50)
model_full_3 <- stm(docs_full,vocab_full, 3, data=meta_full, seed = 15, max.em.its = 50)
model_full_5 <- stm(docs_full,vocab_full, 5, data=meta_full, seed = 15, max.em.its = 50)

labelTopics(model_full_4)
labelTopics(model_full_3)
labelTopics(model_full_5)

topicQuality(model=model_full_4, documents=docs_full)
topicQuality(model=model_full_3, documents=docs_full)
topicQuality(model=model_full_5, documents=docs_full)

mod.out.corr<-topicCorr(model_full_4)
plot.topicCorr(mod.out.corr)
mod.out.corr<-topicCorr(model_full_3)

# STM with four topics (Figure A28) ----
pdf(file="./plots/SI7_stm_4topics.pdf", height=5,width=9)
par(lwd=5, bty="n")
plot.STM(model_full_4,type="summary", n = 6, labeltype = "prob",
         main="",
         xlab="Expected topic proportions",
         xlim=c(0,1),
         axes=FALSE,
         lwd=5
         )
axis(side=1, at=seq(0,1,.1))
dev.off()

set.seed(1234)
prep <- estimateEffect(1:4 ~ residence, model_full_4, meta = meta_full, uncertainty = "Global")
summary(prep, topics = 1:4)

# STM regression (Figure A29) ----
pdf(file="./plots/SI7_stm1.pdf", height=5,width=9)
par(bty="n")
plot(prep, covariate = "residence", topics = c(1,2,3,4), 
     model = model_full_4, ci.level=.9, method = "difference", 
     cov.value1 = "Scotland", cov.value2 = "Wales", 
     main = "Effect of sample: Scotland [0] vs. Wales [1]", 
     labeltype = "custom", custom.labels = c("Topic 1", "Topic 2", "Topic 3", "Topic 4"),
     xlim = c(-0.06,0.06))
dev.off()

# STM regression (Figure A30) ----
pdf(file="./plots/SI7_stm2.pdf", height=5,width=9)
par(bty="n")
plot(prep, covariate = "residence", topics = c(1,2,3,4), 
     model = model_full_4, ci.level=.9, method = "difference", 
     cov.value1 = "Scotland", cov.value2 = "England", 
     main = "Effect of sample: Scotland [0] vs. Yorkshire and Cumbria [1]", 
     labeltype = "custom", custom.labels = c("Topic 1", "Topic 2", "Topic 3", "Topic 4"),
     xlim = c(-0.06, 0.06))
dev.off()



# STM regression (Figure A31) ----
pdf(file="./plots/SI7_stm3.pdf", height=5,width=9)
par(bty="n")
plot(prep, covariate = "residence", topics = c(1,2,3,4), 
     model = model_full_4, ci.level=.9, method = "difference", 
     cov.value1 = "Wales", cov.value2 = "England", 
     main = "Effect of sample: Wales [0] vs. Yorkshire and Cumbria [1]", 
     labeltype = "custom", custom.labels = c("Topic 1", "Topic 2", "Topic 3", "Topic 4"),
     xlim=c(-0.06, 0.06))
dev.off()



# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#                         END OF FILE
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

